# global parameters
## data generation
ksize_vec <- seq(1, 9, by = 2) # filter size
alpha_vec <- c(0, 0.5, 1, 2.5, 5) # weight matrix
nS <- 32 # space grid
Nf <- 1000 # generate 1000 subjects over all, but use only 100 for the block bootstrap
N <- 100 # 100 subjects for block bootstrap
## block bootstrap
max_size <- 9 # max block
M <- 1000 # number of bootstrap iter
source(here("Code/RandomField.R"))
source(here("Code/BlockBoot.R"))
The simulation, technically, would be established on a continuous space-time domain, even though the final “observations” would be a discrete realization.
Take a Gaussian process for example:
\[Y_i(\mathbf{s}, t) = \eta_i(\mathbf{s}, t) +\epsilon_i(\mathbf{s}, t)\] 2. The true latent process is composed of a fixed process and a random (subject-specific) process.
\[\eta_i(\mathbf{s}, t) = \mu(\mathbf{s}, t)+b_i(\mathbf{s}, t)\]
\[\epsilon_i(\mathbf{s}, t) = \frac{1}{N_r}\sum_{\mathbf{s'} \in S_r}Z(\mathbf{s'}, t)\]
where:
In this section, we would like to see if the filter size of moving average would affect the spatial correlation. Here we generate 2D image of 32 by 32. Note that filter size should be odd numbers, a convention inherited from image analysis. Thought even size filter is technically possible, it is much more convenienve to use odd size for many operations since it would have a center pixel
ma_ksize <- expand_grid(s2=1:32, s1=1:32)
for(i in seq_along(ksize_vec)){
MAerr_mat <- MA_rand_field(kf = ksize_vec[i], ki = 32)
ma_ksize[, paste0("k", ksize_vec[i])] <- as.vector(MAerr_mat)
}
ma_ksize %>% pivot_longer(starts_with("k")) %>%
ggplot()+
geom_tile(aes(x=s1, y=s2, fill=value))+
facet_wrap(~name, nrow = 1)
Below I plot the variogram function at different kernel sizes. Variogram is a measure of spatial dependence as a function of distance. Smaller value indicates greater dependence.
bind_rows(
variogram(k1~1, locations = ~s1+s2, data = ma_ksize) %>% mutate(ksize = 1),
variogram(k3~1, locations = ~s1+s2, data = ma_ksize) %>% mutate(ksize = 3),
variogram(k5~1, locations = ~s1+s2, data = ma_ksize) %>% mutate(ksize = 5),
variogram(k7~1, locations = ~s1+s2, data = ma_ksize) %>% mutate(ksize = 7),
variogram(k9~1, locations = ~s1+s2, data = ma_ksize) %>% mutate(ksize = 9)
) %>%
ggplot()+
geom_line(aes(x=dist, y=gamma, col=as.factor(ksize)))+
labs(y="Sample variagram", col = "Filter size")
Some observations:
Here, I would like to use weighted average within a filter. Let’s fix the kernel size to be 5, and explore effect of difference weight matrix.
I would like to construct a weight matrix that decay fast from center to border. Let’s try using inverse of the exponential of the square of Euclidean distance to center:
\[w_{ij} = \frac{exp(-\alpha d_{ij}^2)}{\sum_{i,j=1}^5 exp(-\alpha d_{ij}^2)}\]
# proportional to inverse exponential Euclidian to center (to avoid zero demonimator)
wt_dist <- array(NA, dim = c(5, 5, length(alpha_vec)))
for(m in seq_along(alpha_vec)){
for(i in 1:5){
for(j in 1:5){
wt_dist[i,j, m] <- exp(-alpha_vec[m]*((i-3)^2+(j-3)^2))
}
}
# normalization
wt_dist[,,m] <- wt_dist[,,m]/sum(wt_dist[,,m])
}
df_wt <- expand.grid(s2=1:5, s1=1:5)
for(m in seq_along(alpha_vec)){
df_wt[, paste0("alpha", alpha_vec[m])] <- as.vector(wt_dist[,,m])
}
df_wt %>%
pivot_longer(starts_with("alpha")) %>%
mutate(name=fct_rev(name)) %>%
ggplot()+
geom_tile(aes(x=s1, y=s2, fill=value))+
facet_wrap(~name, nrow = 1)+
labs(title = "Weight matrix")
Here, when \(\alpha=5\), the boarder weight can get very, very close to zero (corner pixel: 4.14e-18). The center pixel takes 97.36% of the weight. When \(\alpha=0\), the weight matrix gets us a simple average. As \(\alpha\) increase, the filter put more weight on the center pixel and less weight on the boarder pixels, leading to lower spatial dependence.
ma_wt <- expand_grid(s2=1:32, s1=1:32)
for(m in seq_along(alpha_vec)){
this_wt <- wt_dist[,,m]
MAerr_mat <- MWA_rand_field(kf = 5, ki = 32, wt = this_wt)
ma_wt[, paste0("alpha", alpha_vec[m])] <- as.vector(MAerr_mat)
}
ma_wt %>% pivot_longer(starts_with("alpha")) %>%
mutate(name=fct_rev(name)) %>%
ggplot()+
geom_tile(aes(x=s1, y=s2, fill=value))+
facet_wrap(~name, nrow = 1)
bind_rows(
variogram(alpha0~1, locations = ~s1+s2, data = ma_wt) %>% mutate(alpha = 0),
variogram(alpha0.5~1, locations = ~s1+s2, data = ma_wt) %>% mutate(alpha = 0.5),
variogram(alpha1~1, locations = ~s1+s2, data = ma_wt) %>% mutate(alpha = 1),
variogram(alpha2.5~1, locations = ~s1+s2, data = ma_wt) %>% mutate(alpha = 2.5),
variogram(alpha5~1, locations = ~s1+s2, data = ma_wt) %>% mutate(alpha = 5)
) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_line(aes(x=dist, y=gamma, col=alpha))+
labs(y="Sample variagram", col = "Alpha")
Follow up on last time, we would like to generate data from the null hypothesis where \(Y_1\), \(Y_2\) are not correlated with each other. We will also remove any fixed/random effect, leaving only the moving average error, in case any unexpected correlation is induced
We generate data from the following scheme:
\[\begin{aligned} Y_{1i}(s_1, s_2, t) &=\epsilon_{1i}(s_1, s_2, t) \\ Y_{2i}(s_1, s_2, t) &=\epsilon_{2i}(s_1, s_2, t) \\ \end{aligned}\]
# set up space-time grid
# generate a 2D image of 32 by 32
df_subj <- expand_grid(ksize = ksize_vec, id=1:Nf, s2=1:nS, s1 = 1:nS)
df_subj$Y1 <- df_subj$Y2 <- NA
# generate individual scores
# true_xi <- matrix(rnorm(2*N, 0, 1.5), nrow = N, ncol = 2)
# true_zeta <- matrix(rnorm(2*N, 0, 1.5), nrow = N, ncol = 2)
# generate outcomes
pb <- txtProgressBar(min=0, max=Nf, style = 3)
t1 <- Sys.time()
for(i in 1:Nf){ # fix a subject
for(k in seq_along(ksize_vec)){ # fix a time point
# random effect of this subject at this time
# dist_it <- df_subj$dist[df_subj$id==i & df_subj$t==this_t]
# generate Y1
## a moving average error
Y1 <- MA_rand_field(ksize_vec[k], nS)
# y1_it <- true_xi[i,1]*dist_it+true_xi[i,2]*this_t + as.vector(ma_err1)
df_subj$Y1[df_subj$ksize==ksize_vec[k] & df_subj$id==i] <- as.vector(Y1)
# generate Y2
## a moving average error
Y2 <- MA_rand_field(ksize_vec[k], nS)
# y2_it <- true_zeta[i,1]*dist_it+true_zeta[i,2]*this_t + as.vector(ma_err2)
df_subj$Y2[df_subj$ksize==ksize_vec[k] & df_subj$id==i] <- as.vector(Y2)
}
setTxtProgressBar(pb, i)
}
t2 <- Sys.time()
close(pb)
# save data
# save(df_subj, file = here("Data/sim_data_ksize.RData"))
It took 7.154 minutes to generate data for 100 subjects. Below we show an example of one subject.
df_subj %>%
filter(id==15) %>%
pivot_longer(starts_with("Y")) %>%
ggplot()+
geom_tile(aes(x=s1, y=s2, fill = value))+
facet_grid(cols=vars(ksize), rows = vars(name))+
labs(title = "Generated data of subject ID = 15")
We fit a simple linear model across space conditioning on each subject and time:
\[Y_{2i}(\mathbf{s}|t) = \beta_{it0}+\beta_{it1}Y_{1i}(\mathbf{s}|t)+\epsilon_i(\mathbf{s}|t)\]
df_subj %>%
filter(id %in% sample(1:Nf, size = 4)) %>%
ggplot(aes(x=Y1, y=Y2))+
geom_point(size = 0.2)+
geom_smooth(formula = 'y~x', method = "lm")+
stat_cor(method = "pearson")+
facet_grid(rows = vars(id), cols = vars(ksize))+
labs(title = "Perason correlation of four subject")
lm_err <- data.frame(ksize = ksize_vec)
N_vec <- c(100, 200, 500, 1000)
for(n in seq_along(N_vec)){
err_n <- df_subj %>%
filter(id <= N_vec[n]) %>%
group_by(id, ksize) %>%
group_modify(~{
fit_lm <- lm(Y2~Y1, data = .x)
data.frame(beta = coef(fit_lm)["Y1"],
pval = summary(fit_lm)$coefficients["Y1", "Pr(>|t|)"])
}) %>%
mutate(reject = pval < 0.05) %>%
group_by(ksize) %>%
summarise_at("reject", mean)
lm_err[, paste0("N", N_vec[n])] <- err_n$reject
}
lm_err %>%
rename("Filter size" = "ksize") %>%
kable(table.attr = "style=\"color:black;\"") %>%
kable_styling(full_width = F) %>%
add_header_above(c(" " = 1, "Type I error" = "4"))
| Filter size | N100 | N200 | N500 | N1000 |
|---|---|---|---|---|
| 1 | 0.01 | 0.025 | 0.038 | 0.047 |
| 3 | 0.35 | 0.395 | 0.382 | 0.353 |
| 5 | 0.55 | 0.535 | 0.524 | 0.561 |
| 7 | 0.61 | 0.680 | 0.664 | 0.645 |
| 9 | 0.79 | 0.770 | 0.750 | 0.742 |
After increasing the sample size to 1000, the k=1 case had 4.4% type I error, still slightly lower than the nominal value of 5%. Does it indicate that data generated under this scheme is more likely to have false rejection under simple linear regression? Would it be because of the spatial correlation, such as the spatial correlation introduced some kind of correlation between \(Y_1\) and \(Y_2\)?
We hope to bootstrap non-overlapping blocks of pixels to preserve some degree of correlation. While the image may not be evenly divided by the block size, we try to make the expectation of image size constant, so that each block will be sampled with equal probability. For the re-sampled image, regardless of its size, we will fit a simple linear regression model for each subject and examine the significance of the slope.
PS. Originally, we wanted to set the resampling probability to be propotional to image size. But in fact, I think this approach couldn’t make the expectation of the pixels as the same as the original image. It would keep the expected size of the sampled image unchanged if all blocks are sampled with equal probability.
Let’s say the original image has N pixels and B blocks, and we want to resample a new image with also B blocks:
\[E(N_{new}) = E(\sum_{b=1}^Bn_b^{new}) = \sum_{b=1}^BE(n_b^{new})=\sum_{b=1}^B\sum_{b=1}^Bn_bp_b = \sum_{b=1}^B\sum_{b=1}^B n_b\frac{n_b}{N}=B\frac{\sum_{b=1}^Bn_B^2}{N} \] If we set \(p_b = 1/B\), then
\[E(N_{new}) = E(\sum_{b=1}^Bn_b^{new}) = \sum_{b=1}^BE(n_b^{new})=\sum_{b=1}^B\sum_{b=1}^Bn_bp_b = \sum_{b=1}^B\sum_{b=1}^B n_b\frac{1}{B}=\sum_{b=1}^B\frac{N}{B} = N \]
df_subj_k <- df_subj %>% filter(id %in% 1:N)
# ksize_vec <- c(1, 3, 5)
# containers
# bootstrap image size
img_size <- array(NA, dim = c(length(ksize_vec), max_size, N, M))
slope_est <- array(NA, dim = c(length(ksize_vec), max_size, N, M))
# pval <- array(NA, dim = c(length(ksize_vec), length(bsize), N, M))
# dim(img_size)
# bootstrap
pb <- txtProgressBar(0, length(ksize_vec)*max_size*N, style = 3)
ct <- 0
t1 <- Sys.time()
for(k in seq_along(ksize_vec)){ # filter size for data generation
for(b in 1:max_size){ # block size for bootstrap
for(i in 1:N){ # for each subject
# A matrix of observed outcome
this_df <- df_subj_k %>% filter(ksize==ksize_vec[k] & id == i)
Y_mat <- matrix(this_df$Y2, nS, nS)
# divide the matrix into blocks
rblock <- (row(Y_mat)-1)%/%b+1
cblock <- (col(Y_mat)-1)%/%b+1
block_id_mat <- (rblock-1)*max(cblock) + cblock
nblock <- max(block_id_mat)
# sample blocks
# sample the same number of blocks as the original image
this_df$block_id <- as.vector(block_id_mat)
block_list <- split(this_df, f = this_df$block_id)
for(m in 1:M){
boot_block <- sample(1:nblock, size = nblock, replace = T)
boot_df <- bind_rows(block_list[boot_block])
img_size[k, b, i, m] <- nrow(boot_df)
# fit model
boot_lm <- summary(lm(Y2~Y1, data = boot_df))$coefficients
slope_est[k, b, i, m] <- boot_lm["Y1", "Estimate"]
}
ct <- ct+1
setTxtProgressBar(pb, ct)
}
}
}
t2 <- Sys.time()
# check expectation of image size
# apply(img_size[ , , , ], MARGIN = 1:3, FUN = mean)
# mean(img_size)
# calculate Type I error
lqt_mat <- apply(slope_est[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.025)
uqt_mat <- apply(slope_est[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.975)
reject_mat <- 0 < lqt_mat | uqt_mat < 0
typeIerror <- apply(reject_mat, 1:2, mean)
typeIerror <- data.frame(ksize = ksize_vec, typeIerror)
typeIerror %>% pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
ggplot()+
geom_line(aes(x=bsize, y=value, group=as.factor(ksize), col=as.factor(ksize)))+
geom_point(aes(x=bsize, y=value, group=as.factor(ksize), col=as.factor(ksize)))+
geom_hline(yintercept = 0.05, col = "black", linetype = "dashed")+
labs(x="Block size", y="Type I error", col = "Filter size")+
scale_x_continuous(breaks = 1:max_size)
Below I show the mean and standard deviation of bootstrap slope estimates for all 100 subjects (each point represents a subject).
# mean slope estimates
slope_mean1 <- apply(slope_est, MARGIN = 1:3, FUN = mean)
array(aperm(slope_mean1, perm = c(1, 3, 2)),
dim =c(N*length(ksize_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(ksize_vec)),
ksize = rep(ksize_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~ksize, ncol = 1)+
labs(x="Block size", title="Bootstrap mean of slope estimates")+
scale_x_continuous(breaks = 1:max_size)+
geom_hline(yintercept = 0, col="red", linetype = "dashed")
# mean slope estimates
slope_sd1 <- apply(slope_est, MARGIN = 1:3, FUN = sd)
array(aperm(slope_sd1, perm = c(1, 3, 2)),
dim =c(N*length(ksize_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(ksize_vec)),
ksize = rep(ksize_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~ksize, ncol = 1)+
labs(x="Block size", title="Standard deviation of bootstrap estimates")+
scale_x_continuous(breaks = 1:max_size)
Below I also showed an example of the bootstrap slope estimates of one subject.
array(slope_est[,,15,], dim = c(length(ksize_vec)*max_size, M)) %>%
data.frame() %>%
mutate(ksize = rep(ksize_vec, max_size),
bsize = rep(1:max_size, each = length(ksize_vec)),
.before = 1) %>%
pivot_longer(starts_with("X")) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group=bsize))+
geom_jitter(aes(x=bsize, y=value, group=bsize), size=0.2)+
facet_wrap(~ksize, ncol=1)+
labs(x="Block size", title = "Bootstrap slope estiamtes of subject 15")+
geom_hline(yintercept = 0, col="red", linetype = "dashed")
Here the two parameters of interest are the weight matrix (in data generation process) and the block size (in bootstrap process). I will use the same weight matrix as Section 1.2, and the same block size as Section 2.1.
Just like before, I will generate data for 1000 subject, but only use 100 for the block bootstrap portion. The filter size for generating data is fixed at 5.
# set up space-time grid
# generate a 2D image of 32 by 32
# nS <- 32
# Nf
df_subj2 <- expand_grid(alpha = alpha_vec, id=1:Nf, s2=1:nS, s1 = 1:nS)
# alpha_vec
# wt_dist
df_subj2$Y1 <- df_subj2$Y2 <- NA
# generate outcomes
pb <- txtProgressBar(min=0, max=Nf, style = 3)
t1 <- Sys.time()
for(i in 1:Nf){ # fix a subject
for(k in seq_along(alpha_vec)){ # fix a weight matrix
# generate Y1
## a moving average error
Y1 <- MWA_rand_field(kf = 5, ki = 32, wt = wt_dist[,,k])
df_subj2$Y1[df_subj2$alpha==alpha_vec[k] & df_subj2$id==i] <- as.vector(Y1)
# generate Y2
## a moving average error
Y2 <- MWA_rand_field(kf = 5, ki = 32, wt = wt_dist[,,k])
# y2_it <- true_zeta[i,1]*dist_it+true_zeta[i,2]*this_t + as.vector(ma_err2)
df_subj2$Y2[df_subj2$alpha==alpha_vec[k] & df_subj$id==i] <- as.vector(Y2)
}
setTxtProgressBar(pb, i)
}
t2 <- Sys.time()
close(pb)
# save data
# save(df_subj2, file = here("Data/sim_data_wt.RData"))
df_subj2 %>%
filter(id==15) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
pivot_longer(starts_with("Y")) %>%
ggplot()+
geom_tile(aes(x=s1, y=s2, fill = value))+
facet_grid(cols=vars(alpha), rows = vars(name))+
labs(title = "Generated data of subject ID = 15")
df_subj2 %>%
filter(id %in% sample(1:N, size = 4)) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot(aes(x=Y1, y=Y2))+
geom_point(size = 0.2)+
geom_smooth(formula = 'y~x', method = "lm")+
stat_cor(method = "pearson")+
facet_grid(rows = vars(id), cols = vars(alpha))+
labs(title = "Perason correlation of four subject")
lm_err2 <- data.frame(alpha = alpha_vec)
# N_vec <- c(100, 200, 500, 1000)
for(n in seq_along(N_vec)){
err_n <- df_subj2 %>%
filter(id <= N_vec[n]) %>%
group_by(id, alpha) %>%
group_modify(~{
fit_lm <- lm(Y2~Y1, data = .x)
data.frame(beta = coef(fit_lm)["Y1"],
pval = summary(fit_lm)$coefficients["Y1", "Pr(>|t|)"])
}) %>%
mutate(reject = pval < 0.05) %>%
group_by(alpha) %>%
summarise_at("reject", mean)
lm_err2[, paste0("N", N_vec[n])] <- err_n$reject
}
lm_err2 %>%
# rename("Alpha" = "ksize") %>%
arrange(desc(alpha)) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
kable(table.attr = "style=\"color:black;\"") %>%
kable_styling(full_width = F) %>%
add_header_above(c(" " = 1, "Type I error" = "4"))
| alpha | N100 | N200 | N500 | N1000 |
|---|---|---|---|---|
| 5 | 0.04 | 0.035 | 0.030 | 0.040 |
| 2.5 | 0.06 | 0.050 | 0.056 | 0.057 |
| 1 | 0.23 | 0.225 | 0.242 | 0.247 |
| 0.5 | 0.40 | 0.385 | 0.386 | 0.423 |
| 0 | 0.48 | 0.490 | 0.526 | 0.552 |
We will follow the same set up as Section 2.2.
# M <- 1000 # number of boostraps
# N <- 100 # sample size
df_subj_wt <- df_subj2 %>% filter(id %in% 1:N)
# max_size
# containers
# bootstrap image size
img_size2 <- array(NA, dim = c(length(alpha_vec), max_size, N, M))
slope_est2 <- array(NA, dim = c(length(alpha_vec), max_size, N, M))
# bootstrap
pb <- txtProgressBar(0, length(alpha_vec)*max_size*N, style = 3)
ct <- 0
t1 <- Sys.time()
for(k in seq_along(alpha_vec)){ # filter size for data generation
for(b in 1:max_size){ # block size for bootstrap
for(i in 1:N){ # for each subject
# A matrix of observed outcome
this_df <- df_subj_wt %>% filter(alpha==alpha_vec[k] & id == i)
Y_mat <- matrix(this_df$Y2, nS, nS)
# divide the matrix into blocks
rblock <- (row(Y_mat)-1)%/%b+1
cblock <- (col(Y_mat)-1)%/%b+1
block_id_mat <- (rblock-1)*max(cblock) + cblock
nblock <- max(block_id_mat)
# sample blocks
# sample the same number of blocks as the original image
this_df$block_id <- as.vector(block_id_mat)
block_list <- split(this_df, f = this_df$block_id)
for(m in 1:M){
boot_block <- sample(1:nblock, size = nblock, replace = T)
boot_df <- bind_rows(block_list[boot_block])
img_size2[k, b, i, m] <- nrow(boot_df)
# fit model
boot_lm <- summary(lm(Y2~Y1, data = boot_df))$coefficients
slope_est2[k, b, i, m] <- boot_lm["Y1", "Estimate"]
}
ct <- ct+1
setTxtProgressBar(pb, ct)
}
}
}
t2 <- Sys.time()
# check expectation of image size
# apply(img_size[ , , , ], MARGIN = 1:3, FUN = mean)
# mean(img_size)
# calculate Type I error
lqt_mat2 <- apply(slope_est2[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.025)
uqt_mat2 <- apply(slope_est2[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.975)
reject_mat2 <- 0 < lqt_mat2 | uqt_mat2 < 0
typeIerror2 <- apply(reject_mat2, 1:2, mean)
typeIerror2 <- data.frame(alpha = alpha_vec, typeIerror2)
typeIerror2 %>% pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_line(aes(x=bsize, y=value, group=as.factor(alpha), col=as.factor(alpha)))+
geom_point(aes(x=bsize, y=value, group=as.factor(alpha), col=as.factor(alpha)))+
geom_hline(yintercept = 0.05, col = "black", linetype = "dashed")+
labs(x="Block size", y="Type I error", col = "Weight matrix")+
scale_x_continuous(breaks = 1:max_size)
# mean slope estimates
slope_mean2 <- apply(slope_est2, MARGIN = 1:3, FUN = mean)
array(aperm(slope_mean2, perm = c(1, 3, 2)),
dim =c(N*length(alpha_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(alpha_vec)),
alpha = rep(alpha_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~alpha, ncol = 1)+
labs(x="Block size", title="Bootstrap mean of slope estimates")+
scale_x_continuous(breaks = 1:max_size)+
geom_hline(yintercept = 0, col="red", linetype = "dashed")
# mean slope estimates
slope_sd2 <- apply(slope_est2, MARGIN = 1:3, FUN = sd)
array(aperm(slope_sd2, perm = c(1, 3, 2)),
dim =c(N*length(alpha_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(alpha_vec)),
alpha = rep(alpha_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~alpha, ncol = 1)+
labs(x="Block size", title="Standard deviation of bootstrap estimates")+
scale_x_continuous(breaks = 1:max_size)
array(slope_est2[,,15,], dim = c(length(alpha_vec)*max_size, M)) %>%
data.frame() %>%
mutate(alpha = rep(alpha_vec, max_size),
bsize = rep(1:max_size, each = length(alpha_vec)),
.before = 1) %>%
pivot_longer(starts_with("X")) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group=bsize))+
geom_jitter(aes(x=bsize, y=value, group=bsize), size=0.2)+
facet_wrap(~alpha, ncol=1)+
labs(x="Block size", title = "Bootstrap slope estiamtes of subject 15")+
geom_hline(yintercept = 0, col="red", linetype = "dashed")
In this section, we with to generate two spatial-dependent variables that are correlated with each other. Let’s say our generation is also based only on moving average of white noise field, we update the data generation scheme as follows:
\[\begin{aligned} Y_{1i}(s_1, s_2, t) &=\epsilon_{1i}(s_1, s_2, t) \\ Y_{2i}(s_1, s_2, t) &=\beta_0+\beta_1 Y_{1i}(s_1, s_2, t) + \epsilon_{2i}(s_1, s_2, t) \\ \end{aligned}\]
# set up space-time grid
# generate a 2D image of 32 by 32
# nS <- 32
# Nf <- 1000 # generate 1000 subjects over all, but use only 100 for the block bootstrap
# N <- 100
df_h1_k <- expand_grid(ksize = ksize_vec, id=1:Nf, s2=1:nS, s1 = 1:nS)
beta_true <- c(0, 1) # true coefficients
df_h1_k$Y1 <- df_h1_k$Y2 <- NA
# generate individual scores
# true_xi <- matrix(rnorm(2*N, 0, 1.5), nrow = N, ncol = 2)
# true_zeta <- matrix(rnorm(2*N, 0, 1.5), nrow = N, ncol = 2)
# generate outcomes
pb <- txtProgressBar(min=0, max=Nf, style = 3)
t1 <- Sys.time()
for(i in 1:Nf){ # fix a subject
for(k in seq_along(ksize_vec)){ # fix a time point
# generate Y1
## a moving average error
Y1 <- MA_rand_field(ksize_vec[k], nS)
# y1_it <- true_xi[i,1]*dist_it+true_xi[i,2]*this_t + as.vector(ma_err1)
df_h1_k$Y1[df_h1_k$ksize==ksize_vec[k] & df_h1_k$id==i] <- as.vector(Y1)
# generate Y2
## a moving average error
Y2_err <- MA_rand_field(ksize_vec[k], nS)
Y2_err <- as.vector(Y2_err)
Y2_mat <- matrix(c(rep(1, nS^2), as.vector(Y1)), ncol=2, byrow = F)
df_h1_k$Y2[df_h1_k$ksize==ksize_vec[k] & df_h1_k$id==i] <- Y2_mat %*% beta_true+Y2_err
}
setTxtProgressBar(pb, i)
}
t2 <- Sys.time()
close(pb)
# save data
# save(df_subj, file = here("Data/sim_data_ksize.RData"))
df_h1_k %>%
filter(id==15) %>%
pivot_longer(starts_with("Y")) %>%
ggplot()+
geom_tile(aes(x=s1, y=s2, fill = value))+
facet_grid(cols=vars(ksize), rows = vars(name))+
labs(title = "Generated data of subject ID = 15")
df_h1_k %>%
filter(id %in% sample(1:N, size = 4)) %>%
ggplot(aes(x=Y1, y=Y2))+
geom_point(size = 0.2)+
geom_smooth(formula = 'y~x', method = "lm")+
stat_cor(method = "pearson")+
facet_grid(rows = vars(id), cols = vars(ksize))+
labs(title = "Perason correlation of four subject")
Since data is generated under the alternative hypothesis, we calculate power (true rejection rate) in this case.
It is expected that LM would end up with all rejection, considering the spatial correlation
lm_err <- data.frame(ksize = ksize_vec)
N_vec <- c(100, 200, 500, 1000)
for(n in seq_along(N_vec)){
err_n <- df_h1_k %>%
filter(id <= N_vec[n]) %>%
group_by(id, ksize) %>%
group_modify(~{
fit_lm <- lm(Y2~Y1, data = .x)
data.frame(beta = coef(fit_lm)["Y1"],
pval = summary(fit_lm)$coefficients["Y1", "Pr(>|t|)"])
}) %>%
mutate(reject = pval < 0.05) %>%
group_by(ksize) %>%
summarise_at("reject", mean)
lm_err[, paste0("N", N_vec[n])] <- err_n$reject
}
lm_err %>%
rename("Filter size" = "ksize") %>%
kable(table.attr = "style=\"color:black;\"") %>%
kable_styling(full_width = F) %>%
add_header_above(c(" " = 1, "Power" = "4"))
| Filter size | N100 | N200 | N500 | N1000 |
|---|---|---|---|---|
| 1 | 1 | 1 | 1 | 1 |
| 3 | 1 | 1 | 1 | 1 |
| 5 | 1 | 1 | 1 | 1 |
| 7 | 1 | 1 | 1 | 1 |
| 9 | 1 | 1 | 1 | 1 |
# subset 100 subjects
df_h1_k <- df_h1_k %>% filter(id %in% 1:N)
# containers
# bootstrap image size
slope_est3 <- array(NA, dim = c(length(ksize_vec), max_size, N, M))
# bootstrap
pb <- txtProgressBar(0, length(ksize_vec)*max_size*N, style = 3)
ct <- 0
t1 <- Sys.time()
for(k in seq_along(ksize_vec)){ # filter size for data generation
for(b in 1:max_size){ # block size for bootstrap
for(i in 1:N){ # for each subject
# A matrix of observed outcome
this_df <- df_h1_k %>% filter(ksize==ksize_vec[k] & id == i)
# block, bootstrap
slope_est3[k, b, i, ] <- BlockBoot(df = this_df, bsize = b, M=M, nS=nS)
ct <- ct+1
setTxtProgressBar(pb, ct)
}
}
}
t2 <- Sys.time()
# calculate Type I error
lqt_mat3 <- apply(slope_est3[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.025)
uqt_mat3 <- apply(slope_est3[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.975)
cover_mat3 <- lqt_mat3 < 1 & 1 < uqt_mat3
cover_rate3 <- apply(cover_mat3, 1:2, mean)
cover_rate3 <- data.frame(ksize = ksize_vec, cover_rate3)
cover_rate3 %>% pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
ggplot()+
geom_line(aes(x=bsize, y=value, group=as.factor(ksize), col=as.factor(ksize)))+
geom_point(aes(x=bsize, y=value, group=as.factor(ksize), col=as.factor(ksize)))+
geom_hline(yintercept = 0.95, col = "black", linetype = "dashed")+
labs(x="Block size", y="Coverate rate", col = "Filter size")+
scale_x_continuous(breaks = 1:max_size)
Below I show the mean and standard deviation of bootstrap slope estimates for all 100 subjects (each point represents a subject).
# mean slope estimates
slope_mean3 <- apply(slope_est3, MARGIN = 1:3, FUN = mean)
array(aperm(slope_mean3, perm = c(1, 3, 2)),
dim =c(N*length(ksize_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(ksize_vec)),
ksize = rep(ksize_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~ksize, ncol = 1)+
labs(x="Block size", title="Bootstrap mean of slope estimates")+
scale_x_continuous(breaks = 1:max_size)+
geom_hline(yintercept = 1, col="red", linetype = "dashed")
# mean slope estimates
slope_sd3 <- apply(slope_est3, MARGIN = 1:3, FUN = sd)
array(aperm(slope_sd3, perm = c(1, 3, 2)),
dim =c(N*length(ksize_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(ksize_vec)),
ksize = rep(ksize_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~ksize, ncol = 1)+
labs(x="Block size", title="Standard deviation of bootstrap estimates")+
scale_x_continuous(breaks = 1:max_size)
Below I also showed an example of the bootstrap slope estimates of one subject.
array(slope_est3[,,15,], dim = c(length(ksize_vec)*max_size, M)) %>%
data.frame() %>%
mutate(ksize = rep(ksize_vec, max_size),
bsize = rep(1:max_size, each = length(ksize_vec)),
.before = 1) %>%
pivot_longer(starts_with("X")) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group=bsize))+
geom_jitter(aes(x=bsize, y=value, group=bsize), size=0.2)+
facet_wrap(~ksize, ncol=1)+
labs(x="Block size", title = "Bootstrap slope estiamtes of subject 15")+
geom_hline(yintercept = 1, col="red", linetype = "dashed")
# set up space-time grid
# generate a 2D image of 32 by 32
# nS <- 32
# Nf <- 1000 # generate 1000 subjects over all, but use only 100 for the block bootstrap
# N <- 100
df_h1_wt <- expand_grid(alpha=alpha_vec, id=1:Nf, s2=1:nS, s1 = 1:nS)
beta_true <- c(0, 1) # true coefficients
df_h1_wt$Y2 <- df_h1_wt$Y1 <- NA
# generate outcomes
pb <- txtProgressBar(min=0, max=Nf, style = 3)
t1 <- Sys.time()
for(i in 1:Nf){ # fix a subject
for(k in seq_along(alpha_vec)){ # fix a weight matrix
# generate Y1
## a moving average error
Y1 <- MWA_rand_field(kf = 5, ki = 32, wt = wt_dist[,,k])
df_h1_wt$Y1[df_h1_wt$alpha==alpha_vec[k] & df_h1_wt$id==i] <- as.vector(Y1)
# generate Y2
Y2_err <- MWA_rand_field(kf = 5, ki = 32, wt = wt_dist[,,k])
Y2_err <- as.vector(Y2_err)
Y2_mat <- matrix(c(rep(1, nS^2), as.vector(Y1)), ncol=2, byrow = F)
df_h1_wt$Y2[df_h1_wt$alpha==alpha_vec[k] & df_h1_wt$id==i] <- Y2_mat %*% beta_true+Y2_err
}
setTxtProgressBar(pb, i)
}
t2 <- Sys.time()
close(pb)
# save data
# save(df_subj2, file = here("Data/sim_data_wt.RData"))
df_h1_wt %>%
filter(id==15) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
pivot_longer(starts_with("Y")) %>%
ggplot()+
geom_tile(aes(x=s1, y=s2, fill = value))+
facet_grid(cols=vars(alpha), rows = vars(name))+
labs(title = "Generated data of subject ID = 15")
df_h1_wt %>%
# filter(id==1) %>%
filter(id %in% sample(1:N, size = 4)) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot(aes(x=Y1, y=Y2))+
geom_point(size = 0.2)+
geom_smooth(formula = 'y~x', method = "lm")+
stat_cor(method = "pearson")+
facet_grid(rows = vars(id), cols = vars(alpha))+
labs(title = "Perason correlation of four subject")
lm_err4 <- data.frame(alpha = alpha_vec)
# N_vec <- c(100, 200, 500, 1000)
for(n in seq_along(N_vec)){
err_n <- df_h1_wt %>%
filter(id <= N_vec[n]) %>%
group_by(id, alpha) %>%
group_modify(~{
fit_lm <- lm(Y2~Y1, data = .x)
data.frame(beta = coef(fit_lm)["Y1"],
pval = summary(fit_lm)$coefficients["Y1", "Pr(>|t|)"])
}) %>%
mutate(reject = pval < 0.05) %>%
group_by(alpha) %>%
summarise_at("reject", mean)
lm_err4[, paste0("N", N_vec[n])] <- err_n$reject
}
lm_err4 %>%
# rename("Alpha" = "ksize") %>%
arrange(desc(alpha)) %>%
kable(table.attr = "style=\"color:black;\"") %>%
kable_styling(full_width = F) %>%
add_header_above(c(" " = 1, "Power" = "4"))
| alpha | N100 | N200 | N500 | N1000 |
|---|---|---|---|---|
| 5.0 | 1 | 1 | 1 | 1 |
| 2.5 | 1 | 1 | 1 | 1 |
| 1.0 | 1 | 1 | 1 | 1 |
| 0.5 | 1 | 1 | 1 | 1 |
| 0.0 | 1 | 1 | 1 | 1 |
We will follow the same set up as Section 2.2.
# M <- 1000 # number of boostraps
# N <- 100 # sample size
df_h1_wt <- df_h1_wt %>% filter(id %in% 1:N)
# max_size
# containers
# bootstrap image size
# img_size4 <- array(NA, dim = c(length(alpha_vec), max_size, N, M))
slope_est4 <- array(NA, dim = c(length(alpha_vec), max_size, N, M))
# bootstrap
pb <- txtProgressBar(0, length(alpha_vec)*max_size*N, style = 3)
ct <- 0
t1 <- Sys.time()
for(k in seq_along(alpha_vec)){ # filter size for data generation
for(b in 1:max_size){ # block size for bootstrap
for(i in 1:N){ # for each subject
# A matrix of observed outcome
this_df <- df_h1_wt %>% filter(alpha==alpha_vec[k] & id == i)
# block bootstrap
slope_est4[k, b, i, ] <- BlockBoot(df = this_df, bsize = b, M=M, nS=nS)
ct <- ct+1
setTxtProgressBar(pb, ct)
}
}
}
t2 <- Sys.time()
# calculate Type I error
lqt_mat4 <- apply(slope_est4[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.025)
uqt_mat4 <- apply(slope_est4[ , , , ], MARGIN = 1:3, FUN =quantile, probs = 0.975)
cover_mat4 <- lqt_mat4 < 1 & 1 < uqt_mat4
cover_rate4 <- apply(cover_mat4, 1:2, mean)
cover_rate4 <- data.frame(alpha = alpha_vec, cover_rate4)
cover_rate4 %>% pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_line(aes(x=bsize, y=value, group=as.factor(alpha), col=as.factor(alpha)))+
geom_point(aes(x=bsize, y=value, group=as.factor(alpha), col=as.factor(alpha)))+
geom_hline(yintercept = 0.95, col = "black", linetype = "dashed")+
labs(x="Block size", y="Coverate rate", col = "Weight matrix")+
scale_x_continuous(breaks = 1:max_size)
Below I show the mean and standard deviation of bootstrap slope estimates for all 100 subjects (each point represents a subject).
# mean slope estimates
slope_mean4 <- apply(slope_est4, MARGIN = 1:3, FUN = mean)
array(aperm(slope_mean4, perm = c(1, 3, 2)),
dim =c(N*length(alpha_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(alpha_vec)),
alpha = rep(alpha_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~alpha, ncol = 1)+
labs(x="Block size", title="Bootstrap mean of slope estimates")+
scale_x_continuous(breaks = 1:max_size)+
geom_hline(yintercept = 1, col="red", linetype = "dashed")
# mean slope estimates
slope_sd4 <- apply(slope_est4, MARGIN = 1:3, FUN = sd)
array(aperm(slope_sd4, perm = c(1, 3, 2)),
dim =c(N*length(alpha_vec), max_size)) %>%
data.frame() %>%
mutate(id = rep(1:N, each = length(alpha_vec)),
alpha = rep(alpha_vec, N),
.before=1) %>%
pivot_longer(starts_with("X")) %>%
mutate(bsize = as.numeric(gsub("X", "", name))) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group = bsize))+
geom_jitter(aes(x=bsize, y=value), size = 0.2)+
facet_wrap(~alpha, ncol = 1)+
labs(x="Block size", title="Standard deviation of bootstrap estimates")+
scale_x_continuous(breaks = 1:max_size)
Below I also showed an example of the bootstrap slope estimates of one subject.
array(slope_est4[,,15,], dim = c(length(alpha_vec)*max_size, M)) %>%
data.frame() %>%
mutate(alpha = rep(alpha_vec, max_size),
bsize = rep(1:max_size, each = length(ksize_vec)),
.before = 1) %>%
pivot_longer(starts_with("X")) %>%
mutate(alpha=fct_rev(as.factor(alpha))) %>%
ggplot()+
geom_boxplot(aes(x=bsize, y=value, group=bsize))+
geom_jitter(aes(x=bsize, y=value, group=bsize), size=0.2)+
facet_wrap(~alpha, ncol=1)+
labs(x="Block size", title = "Bootstrap slope estiamtes of subject 15")+
geom_hline(yintercept = 1, col="red", linetype = "dashed")
Why couldn’t the block bootstrap approach reach nomial type I error/coverage rate? One speculation is that through generating the two outcomes separately from the same mechanism to introduce spacial intercorrelation, we introduced correlation between the two variables.
To explore that, I wanna look into the residuals after susbtrating the effect of \(Y_1\) on \(Y_2\), in the original image and the bootstrap samples.
Let’s start with one subject (subject 15). I will do the block bootstrap with different block size, and visualize the residual intercorrelation. Putting the bootstrap blocks back to its original 2D format is in fact a very challenging problem. I still haven’t thought of good ways to do that.
df_id15 <- df_subj %>% filter(id == 15)
# simple linear regression
res_id15 <- df_id15 %>% group_by(ksize) %>%
group_modify(~{
fit_lm <- lm(Y2~Y1, data = .x)
data.frame(res = fit_lm$residuals)
}) %>% ungroup()
res_id15 <- res_id15 %>% mutate(s1 = df_id15$s1, s2 = df_id15$s2)
res_id15 %>% ggplot()+
geom_tile(aes(x=s1, y=s2, fill=res))+
facet_wrap(~ksize)
# containers
slope_est5 <- matrix(NA, nrow = length(ksize_vec), ncol = max_size)
res_list <- list()
for(k in seq_along(ksize_vec)){ # filter size for data generation
res_list[[k]] <- list()
# A matrix of observed outcome
this_df <- df_id15 %>% filter(ksize==ksize_vec[k])
Y_mat <- matrix(this_df$Y2, nS, nS)
for(b in 1:max_size){ # block size for bootstrap
# divide the matrix into blocks
rblock <- (row(Y_mat)-1)%/%b+1
cblock <- (col(Y_mat)-1)%/%b+1
block_id_mat <- (rblock-1)*max(cblock) + cblock
nblock <- max(block_id_mat)
# sample blocks
# sample the same number of blocks as the original image
this_df$block_id <- as.vector(block_id_mat)
block_list <- split(this_df, f = this_df$block_id)
# for(m in 1:M){
boot_block <- sample(1:nblock, size = nblock, replace = T)
boot_df <- block_list[boot_block]
boot_df <- bind_rows(boot_df)
# fit model
boot_lm <- lm(Y2~Y1, data = boot_df)
slope_est5[k, b] <- boot_lm$coefficients["Y1"]
boot_df$res <- boot_lm$residuals
# put residuals back to its original 2D format
boot_df %>% mutate(block_id = as.character(block_id)) %>%
mutate(block_id = factor(block_id, levels = levels(block_id), labels = 1:nblock))
}
}
t2 <- Sys.time()
boot_df %>% arrange(s1, s2)
# put the residuals back in the 2D image
for()
boot_img <- matrix(NA, nS, nS)
df_try <- res_list[[5]][[3]] %>%
group_by(block_id) %>%
mutate(s1 = s1-min(s1)+1,
s2 = s2-min(s2)+1) %>%
mutate(ncol = max(s2), nrow = max(s1))
table(df_try$s1)